In [ ]:
# pip install openslide-python
# pip install openslide-bin
In [2]:
import os
from openslide import open_slide
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from openslide.deepzoom import DeepZoomGenerator
from skimage.filters import threshold_otsu
import math
import zipfile
import cv2
import skimage.filters
from skimage import color, measure
from skimage.feature import canny
from scipy.stats import entropy
from PIL import Image
In [3]:
# variables
# Define directory for saving tiles
TRAIN_DATASET_DIR = "train_dataset"
raw_train_images_path = "raw_train"
inp_size=224
overlap_pixels = 70
In [4]:
def unzip_all_files(directory):
# Iterate through all files in the directory
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.zip'):
file_path = os.path.join(root, file)
# Create a ZipFile object
with zipfile.ZipFile(file_path, 'r') as zip_ref:
# Extract all the contents into the same directory
zip_ref.extractall(root)
print(f"Unzipped: {file_path}")
unzip_all_files(raw_train_images_path)
In [5]:
raw_train_df = pd.read_csv('train.csv')
raw_train_df.head()
Out[5]:
| image_id | center_id | patient_id | image_num | label | |
|---|---|---|---|---|---|
| 0 | 006388_0 | 11 | 006388 | 0 | CE |
| 1 | 008e5c_0 | 11 | 008e5c | 0 | CE |
| 2 | 00c058_0 | 11 | 00c058 | 0 | LAA |
| 3 | 01adc5_0 | 11 | 01adc5 | 0 | LAA |
| 4 | 026c97_0 | 4 | 026c97 | 0 | CE |
In [6]:
temp = raw_train_df[raw_train_df['image_id'].isin(['f9fc6b_1'])]
temp
Out[6]:
| image_id | center_id | patient_id | image_num | label | |
|---|---|---|---|---|---|
| 726 | f9fc6b_1 | 11 | f9fc6b | 1 | LAA |
In [7]:
def get_all_files(directory):
file_list = []
# Walk through the directory recursively
for root, dirs, files in os.walk(directory):
for file in files:
# Append the file name to the list
if file.endswith(".tif"):
file_list.append(file.split(".")[0])
return file_list
# Example usage
list_of_available_images = get_all_files(raw_train_images_path)
In [8]:
train_df = raw_train_df[raw_train_df['image_id'].isin(list_of_available_images)]
In [9]:
train_df.label.value_counts()
Out[9]:
label CE 22 LAA 21 Name: count, dtype: int64
In [10]:
def form_train_images_paths(x):
current_file_path = os.path.join(raw_train_images_path, x+'.tif')
if os.path.isfile(current_file_path):
return current_file_path
elif os.path.isdir(current_file_path):
return os.path.join(current_file_path , x+'.tif')
train_df.image_id = train_df.image_id.apply(form_train_images_paths)
C:\Users\Raghava\AppData\Local\Temp\ipykernel_15376\3057052148.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train_df.image_id = train_df.image_id.apply(form_train_images_paths)
In [11]:
train_df.drop(columns=["center_id", "patient_id", "image_num"], inplace=True)
C:\Users\Raghava\AppData\Local\Temp\ipykernel_15376\4186856123.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train_df.drop(columns=["center_id", "patient_id", "image_num"], inplace=True)
In [12]:
# train_df.head()
train_df.reset_index(inplace = True, drop = True)
In [13]:
train_df.head(40)
Out[13]:
| image_id | label | |
|---|---|---|
| 0 | raw_train\008e5c_0.tif\008e5c_0.tif | CE |
| 1 | raw_train\00c058_0.tif | LAA |
| 2 | raw_train\01adc5_0.tif | LAA |
| 3 | raw_train\026c97_0.tif\026c97_0.tif | CE |
| 4 | raw_train\032f10_0.tif\032f10_0.tif | CE |
| 5 | raw_train\0372b0_0.tif\0372b0_0.tif | CE |
| 6 | raw_train\055f6a_0.tif | LAA |
| 7 | raw_train\0aaeb3_0.tif\0aaeb3_0.tif | CE |
| 8 | raw_train\0aff58_0.tif\0aff58_0.tif | CE |
| 9 | raw_train\0cc0bc_0.tif\0cc0bc_0.tif | CE |
| 10 | raw_train\0e696a_1.tif\0e696a_1.tif | CE |
| 11 | raw_train\0ed87f_0.tif\0ed87f_0.tif | CE |
| 12 | raw_train\0ed87f_1.tif\0ed87f_1.tif | CE |
| 13 | raw_train\0ff890_0.tif\0ff890_0.tif | CE |
| 14 | raw_train\15aab4_0.tif\15aab4_0.tif | CE |
| 15 | raw_train\20d5cf_0.tif\20d5cf_0.tif | CE |
| 16 | raw_train\2a25b9_0.tif\2a25b9_0.tif | LAA |
| 17 | raw_train\369366_0.tif\369366_0.tif | CE |
| 18 | raw_train\4f6fb1_1.tif\4f6fb1_1.tif | CE |
| 19 | raw_train\4f9ac6_0.tif\4f9ac6_0.tif | CE |
| 20 | raw_train\c61b26_0.tif | CE |
| 21 | raw_train\c78d6b_0.tif | CE |
| 22 | raw_train\c92b51_0.tif | CE |
| 23 | raw_train\ca2f66_0.tif | CE |
| 24 | raw_train\ca7b57_0.tif | LAA |
| 25 | raw_train\caf901_0.tif | CE |
| 26 | raw_train\d862da_0.tif | LAA |
| 27 | raw_train\d8db68_0.tif | LAA |
| 28 | raw_train\dae554_0.tif | LAA |
| 29 | raw_train\dba56f_0.tif | LAA |
| 30 | raw_train\de9e4c_0.tif | LAA |
| 31 | raw_train\e251ff_0.tif | CE |
| 32 | raw_train\e26a04_0.tif | LAA |
| 33 | raw_train\e26a04_1.tif | LAA |
| 34 | raw_train\ed5006_0.tif | LAA |
| 35 | raw_train\f3e9f6_0.tif | LAA |
| 36 | raw_train\f3e9f6_1.tif | LAA |
| 37 | raw_train\f40c69_0.tif | LAA |
| 38 | raw_train\f40c69_1.tif | LAA |
| 39 | raw_train\f7fb11_0.tif | LAA |
In [14]:
def understand_single_WSI(id,show_gridded_tiles=False):
file_path = train_df.loc[id, "image_id"]
label = train_df.loc[id, "label"]
print(f"Filepath: {file_path}, Label: {label}")
slide = open_slide(file_path)
print(slide.properties)
print(f"Slide Dimensions: {slide.dimensions}")
print(f"Slide Level Dimensions: {slide.level_dimensions}")
#in the slide object, we have only 1 level bcoz it is the way it was stored originally
#so that is why we can only view images of original resolution
# Read a small cropped region for visualization
small_region = slide.read_region((1600, 28800), 0, (inp_size, inp_size)).convert("RGB") # (x,y) from top-left, level, (width,height)
small_region = np.array(small_region)
plt.figure(figsize=(3, 3))
plt.title(f"Random cropped slide for image {file_path} belonging to {label} class.")
plt.imshow(small_region)
plt.axis("off") # Hide axes
plt.show()
# DeepZoomGenerator for tiling
tiles = DeepZoomGenerator(slide, tile_size=inp_size, overlap=0, limit_bounds=False)
level_from_last_to_focus = 1
focus_level = tiles.level_count - level_from_last_to_focus
print(f"Total levels: {tiles.level_count}")
print(f"Dimensions per level: {tiles.level_dimensions}")
print(f"Total tiles: {tiles.tile_count}")
print(f"Printing information for level {level_from_last_to_focus} from last:")
print(f"Tile grid size at this level: {tiles.level_tiles[focus_level]}")
print(f"Each tile's dimension: {tiles.get_tile_dimensions(focus_level, (0, 0))}")
if show_gridded_tiles:
cols, rows = tiles.level_tiles[focus_level]
# Set up grid for visualizationshow_gridded_tiles
fig, axes = plt.subplots(rows, cols, figsize=(cols * 2, rows * 2))
if rows == 1 and cols == 1:
axes = np.array([[axes]]) # Ensure 2D array for single tile
elif rows == 1 or cols == 1:
axes = np.expand_dims(axes, axis=0 if rows == 1 else 1) # Expand to 2D
for row in range(rows):
for col in range(cols):
file_name = f"{file_path}_{col}_{row}_{label}.png"
tile = tiles.get_tile(focus_level, (col, row)).convert("RGB")
tile = np.array(tile)
ax = axes[row, col]
ax.set_title(file_name, fontsize=6)
ax.imshow(tile)
ax.axis("off") # Hide axes for clean visualization
plt.tight_layout()
plt.show()
understand_single_WSI(0)
Filepath: raw_train\008e5c_0.tif\008e5c_0.tif, Label: CE
<_PropertyMap {'openslide.level-count': '1', 'openslide.level[0].downsample': '1', 'openslide.level[0].height': '29694', 'openslide.level[0].tile-height': '128', 'openslide.level[0].tile-width': '128', 'openslide.level[0].width': '5946', 'openslide.mpp-x': '1000', 'openslide.mpp-y': '1000', 'openslide.vendor': 'generic-tiff', 'tiff.ResolutionUnit': 'centimeter', 'tiff.XResolution': '10', 'tiff.YResolution': '10'}>
Slide Dimensions: (5946, 29694)
Slide Level Dimensions: ((5946, 29694),)
Total levels: 16 Dimensions per level: ((1, 1), (1, 2), (1, 4), (2, 8), (3, 15), (6, 29), (12, 58), (24, 116), (47, 232), (93, 464), (186, 928), (372, 1856), (744, 3712), (1487, 7424), (2973, 14847), (5946, 29694)) Total tiles: 4871 Printing information for level 1 from last: Tile grid size at this level: (27, 133) Each tile's dimension: (224, 224)
In [15]:
understand_single_WSI(1)
Filepath: raw_train\00c058_0.tif, Label: LAA
<_PropertyMap {'openslide.level-count': '1', 'openslide.level[0].downsample': '1', 'openslide.level[0].height': '61801', 'openslide.level[0].tile-height': '128', 'openslide.level[0].tile-width': '128', 'openslide.level[0].width': '15255', 'openslide.mpp-x': '1000', 'openslide.mpp-y': '1000', 'openslide.vendor': 'generic-tiff', 'tiff.ResolutionUnit': 'centimeter', 'tiff.XResolution': '10', 'tiff.YResolution': '10'}>
Slide Dimensions: (15255, 61801)
Slide Level Dimensions: ((15255, 61801),)
Total levels: 17 Dimensions per level: ((1, 1), (1, 2), (1, 4), (2, 8), (4, 16), (8, 31), (15, 61), (30, 121), (60, 242), (120, 483), (239, 966), (477, 1932), (954, 3863), (1907, 7726), (3814, 15451), (7628, 30901), (15255, 61801)) Total tiles: 25571 Printing information for level 1 from last: Tile grid size at this level: (69, 276) Each tile's dimension: (224, 224)
In [19]:
def create_dirs(label):
"""Creates directory structure for saving tiles."""
label_dir = os.path.join(TRAIN_DATASET_DIR, str(label))
os.makedirs(label_dir, exist_ok=True)
return label_dir
def contains_dark_patches(tile, dark_threshold=50, dark_ratio=0.05):
"""
Checks if a tile contains any dark patches beyond a certain threshold.
"""
gray_tile = np.mean(np.array(tile), axis=-1)
dark_pixels = np.sum(gray_tile < dark_threshold)
total_pixels = gray_tile.size
return (dark_pixels / total_pixels) > dark_ratio
def is_blurry(tile, laplacian_threshold=200):
"""
Checks if a tile is blurry using the variance of the Laplacian method.
"""
gray_tile = cv2.cvtColor(np.array(tile), cv2.COLOR_RGB2GRAY)
laplacian_var = cv2.Laplacian(gray_tile, cv2.CV_64F).var()
return laplacian_var < laplacian_threshold
def is_useful_tile(tile, intensity_threshold=150, variance_threshold=500):
"""
Filters tiles based on:
- Mean intensity (removes mostly white/empty tiles)
- Variance threshold (removes low-information tiles)
- Full coverage of nuclei, plasma, and cytoplasm
- No dark patches
- No blurriness
"""
tile_array = np.array(tile, dtype=np.uint8)
gray_tile = np.mean(tile_array, axis=-1)
mean_intensity = np.mean(gray_tile)
variance = np.var(gray_tile)
return (
mean_intensity < intensity_threshold and mean_intensity > 90 and
variance > variance_threshold and
not contains_dark_patches(tile) and
not is_blurry(tile)
)
def preprocess_single_WSI(id, plot_all_grided_images=False, save_files=False,preprocess_tiles=True):
"""Extracts, filters, and saves important tiles from a whole-slide image."""
file_path = train_df.loc[id, "image_id"]
label = train_df.loc[id, "label"]
print(f"Processing: {file_path}, Label: {label}")
base_filename = os.path.basename(file_path).replace("/", "_").replace("\\", "_")
slide = open_slide(file_path)
tiles = DeepZoomGenerator(slide, tile_size=inp_size, overlap=overlap_pixels, limit_bounds=False)
focus_level = tiles.level_count - 1
cols, rows = tiles.level_tiles[focus_level]
save_dir = create_dirs(label)
useful_tiles = []
useful_tile_count = 0
for row in range(rows):
for col in range(cols):
tile = tiles.get_tile(focus_level, (col, row)).convert("RGB")
if is_useful_tile(tile):
print("tile found useful")
if preprocess_tiles : tile = preprocess_tile(tile)
# if not is_useful_tile(tile):continue
if save_files:
file_name = f"{base_filename}_{col}_{row}.png"
save_path = os.path.join(save_dir, file_name)
tile.save(save_path)
useful_tiles.append(np.array(tile))
useful_tile_count += 1
if (not save_files) and useful_tile_count >= 20:
break
if (not save_files) and useful_tile_count >= 20:
break
print(f"Saved {useful_tile_count} useful tiles in '{save_dir}'.")
if plot_all_grided_images and useful_tiles:
grid_size = math.ceil(math.sqrt(len(useful_tiles)))
fig, axes = plt.subplots(grid_size, grid_size, figsize=(12, 12))
axes = axes.flatten() if isinstance(axes, np.ndarray) else [axes]
for i, tile_img in enumerate(useful_tiles):
axes[i].imshow(tile_img)
axes[i].axis("off")
for j in range(i + 1, len(axes)):
axes[j].axis("off")
plt.suptitle(f"Tiled Images for {base_filename} (Label: {label})", fontsize=14)
plt.show()
In [28]:
def norm_HnE(img, Io=250, alpha=1, beta=0.15):
######## Step 1: Convert RGB to OD ###################
## reference H&E OD matrix.
#Can be updated if you know the best values for your image.
#Otherwise use the following default values.
#Read the above referenced papers on this topic.
HERef = np.array([[0.5626, 0.2159],
[0.7201, 0.8012],
[0.4062, 0.5581]])
### reference maximum stain concentrations for H&E
maxCRef = np.array([1.9705, 1.0308])
# extract the height, width and num of channels of image
h, w, c = img.shape
# reshape image to multiple rows and 3 columns.
#Num of rows depends on the image size (wxh)
img = img.reshape((-1,3))
# calculate optical density
# OD = −log10(I)
#OD = -np.log10(img+0.004) #Use this when reading images with skimage
#Adding 0.004 just to avoid log of zero.
OD = -np.log10((img.astype(float)+1)/Io) #Use this for opencv imread
#Add 1 in case any pixels in the image have a value of 0 (log 0 is indeterminate)
############ Step 2: Remove data with OD intensity less than β ############
# remove transparent pixels (clear region with no tissue)
ODhat = OD[~np.any(OD < beta, axis=1)] #Returns an array where OD values are above beta
#Check by printing ODhat.min()
############# Step 3: Calculate SVD on the OD tuples ######################
#Estimate covariance matrix of ODhat (transposed)
# and then compute eigen values & eigenvectors.
eigvals, eigvecs = np.linalg.eigh(np.cov(ODhat.T))
######## Step 4: Create plane from the SVD directions with two largest values ######
#project on the plane spanned by the eigenvectors corresponding to the two
# largest eigenvalues
That = ODhat.dot(eigvecs[:,1:3]) #Dot product
############### Step 5: Project data onto the plane, and normalize to unit length ###########
############## Step 6: Calculate angle of each point wrt the first SVD direction ########
#find the min and max vectors and project back to OD space
phi = np.arctan2(That[:,1],That[:,0])
minPhi = np.percentile(phi, alpha)
maxPhi = np.percentile(phi, 100-alpha)
vMin = eigvecs[:,1:3].dot(np.array([(np.cos(minPhi), np.sin(minPhi))]).T)
vMax = eigvecs[:,1:3].dot(np.array([(np.cos(maxPhi), np.sin(maxPhi))]).T)
# a heuristic to make the vector corresponding to hematoxylin first and the
# one corresponding to eosin second
if vMin[0] > vMax[0]:
HE = np.array((vMin[:,0], vMax[:,0])).T
else:
HE = np.array((vMax[:,0], vMin[:,0])).T
# rows correspond to channels (RGB), columns to OD values
Y = np.reshape(OD, (-1, 3)).T
# determine concentrations of the individual stains
C = np.linalg.lstsq(HE,Y, rcond=None)[0]
# normalize stain concentrations
maxC = np.array([np.percentile(C[0,:], 99), np.percentile(C[1,:],99)])
tmp = np.divide(maxC,maxCRef)
C2 = np.divide(C,tmp[:, np.newaxis])
###### Step 8: Convert extreme values back to OD space
# recreate the normalized image using reference mixing matrix
Inorm = np.multiply(Io, np.exp(-HERef.dot(C2)))
Inorm[Inorm>255] = 254
Inorm = np.reshape(Inorm.T, (h, w, 3)).astype(np.uint8)
# Separating H and E components
H = np.multiply(Io, np.exp(np.expand_dims(-HERef[:,0], axis=1).dot(np.expand_dims(C2[0,:], axis=0))))
H[H>255] = 254
H = np.reshape(H.T, (h, w, 3)).astype(np.uint8)
E = np.multiply(Io, np.exp(np.expand_dims(-HERef[:,1], axis=1).dot(np.expand_dims(C2[1,:], axis=0))))
E[E>255] = 254
E = np.reshape(E.T, (h, w, 3)).astype(np.uint8)
return (Inorm, H, E)
def preprocess_tile(image):
"""
Enhanced preprocessing pipeline for WSI tiles:
1. **H&E Normalization**
2. **Contrast & Color Normalization**
3. **Feature Highlighting** (CE: fibrin, LAA: RBC clusters)
4. **Edge & Structure Sharpening**
"""
image = np.array(image)
image,H,E = norm_HnE(image)
return Image.fromarray(image)
hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
hsv[:, :, 2] = cv2.equalizeHist(hsv[:, :, 2])
normalized = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
gray = cv2.cvtColor(normalized, cv2.COLOR_RGB2GRAY)
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
enhanced = clahe.apply(gray)
laplacian = cv2.Laplacian(enhanced, cv2.CV_64F)
sharpened = cv2.convertScaleAbs(enhanced - 0.5 * laplacian)
blurred = cv2.GaussianBlur(sharpened, (5, 5), 0)
edges = cv2.Canny(blurred, threshold1=30, threshold2=100)
enhanced_features = cv2.addWeighted(sharpened, 0.8, edges, 0.2, 0)
kernel = np.ones((3, 3), np.uint8)
enhanced_fibrin = cv2.morphologyEx(enhanced_features, cv2.MORPH_CLOSE, kernel)
return Image.fromarray(enhanced_fibrin)
In [20]:
id = 12
preprocess_single_WSI(id=id, plot_all_grided_images=True,preprocess_tiles = False)
Processing: raw_train\0ed87f_1.tif\0ed87f_1.tif, Label: CE tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful Saved 20 useful tiles in 'train_dataset\CE'.
In [21]:
id = 39
preprocess_single_WSI(id=id, plot_all_grided_images=True,preprocess_tiles = False)
Processing: raw_train\f7fb11_0.tif, Label: LAA tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful Saved 20 useful tiles in 'train_dataset\LAA'.
In [23]:
id = 12
preprocess_single_WSI(id=id, plot_all_grided_images=True,preprocess_tiles = True)
Processing: raw_train\0ed87f_1.tif\0ed87f_1.tif, Label: CE tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful Saved 20 useful tiles in 'train_dataset\CE'.
In [24]:
id = 39
preprocess_single_WSI(id=id, plot_all_grided_images=True,preprocess_tiles = True)
Processing: raw_train\f7fb11_0.tif, Label: LAA tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful Saved 20 useful tiles in 'train_dataset\LAA'.
In [26]:
id = 12
preprocess_single_WSI(id=id, plot_all_grided_images=True,preprocess_tiles = True)
Processing: raw_train\0ed87f_1.tif\0ed87f_1.tif, Label: CE tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful Saved 20 useful tiles in 'train_dataset\CE'.
In [27]:
id = 39
preprocess_single_WSI(id=id, plot_all_grided_images=True,preprocess_tiles = True)
Processing: raw_train\f7fb11_0.tif, Label: LAA tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful tile found useful Saved 20 useful tiles in 'train_dataset\LAA'.
In [ ]:
for idx in train_df.index:
preprocess_single_WSI(idx, plot_all_grided_images=False, save_files=True, preprocess_tiles=True)
In [ ]: